@@ -2919,6 +2919,20 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
2919
2919
// the form: D = A * B + C.
2920
2920
// A is a sparse matrix, half the size of B, and is expanded using the sparsity index.
2921
2921
2922
+ class AMDGPUSWmmacIntrinsicIdxReuse<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
2923
+ Intrinsic< // Sparse-A form (D = A * B + C) taking a sparsity index followed by matrix_a_reuse/matrix_b_reuse flags.
2924
+ [CD], // %D: result, type CD
2925
+ [
2926
+ A, // %A: sparse matrix operand, type A
2927
+ B, // %B: matrix operand, type B
2928
+ LLVMMatchType<0>, // %C: tied to overloaded type 0 (CD when CD is an overloaded "any" type, as in the definitions using this class)
2929
+ Index, // %Sparsity index for A
2930
+ llvm_i1_ty, // matrix_a_reuse (argument 4, immediate)
2931
+ llvm_i1_ty, // matrix_b_reuse (argument 5, immediate)
2932
+ ],
2933
+ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>] // only the two reuse flags are ImmArg
2934
+ >; // NOTE(review): the AMDGPUWmma* classes in this file also list IntrNoCallback and IntrNoFree; confirm the omission here is intended.
2935
+
2922
2936
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
2923
2937
Intrinsic<
2924
2938
[CD], // %D
@@ -3602,6 +3616,161 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
3602
3616
[IntrNoMem, IntrSpeculatable]
3603
3617
>;
3604
3618
3619
+ // WMMA intrinsics: operand-layout classes first, then the GFX1250 intrinsic definitions that use them.
3620
+ class AMDGPUWmmaIntrinsicModsAB<LLVMType AB, LLVMType CD> :
3621
+ Intrinsic< // WMMA form with 1-bit modifiers on A and B only (none on C), followed by matrix_a_reuse/matrix_b_reuse flags.
3622
+ [CD], // %D: result, type CD
3623
+ [
3624
+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (argument 0, immediate)
3625
+ AB, // %A: type AB
3626
+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (argument 2, immediate)
3627
+ LLVMMatchType<1>, // %B: tied to overloaded type 1 (AB when both CD and AB are overloaded "any" types)
3628
+ LLVMMatchType<0>, // %C: tied to overloaded type 0, i.e. the same type as %D
3629
+ llvm_i1_ty, // matrix_a_reuse (argument 5, immediate)
3630
+ llvm_i1_ty, // matrix_b_reuse (argument 6, immediate)
3631
+ ],
3632
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, // ImmArg: both modifiers and both reuse flags
3633
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3634
+ >;
3635
+
3636
+ class AMDGPUWmmaIntrinsicModsC<LLVMType AB, LLVMType CD> :
3637
+ Intrinsic< // WMMA form with a modifier on C only (no A/B modifiers), followed by matrix_a_reuse/matrix_b_reuse flags.
3638
+ [CD], // %D: result, type CD
3639
+ [
3640
+ AB, // %A: type AB
3641
+ LLVMMatchType<1>, // %B: tied to overloaded type 1 (AB when both CD and AB are overloaded "any" types)
3642
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) (argument 2, immediate)
3643
+ LLVMMatchType<0>, // %C: tied to overloaded type 0, i.e. the same type as %D
3644
+ llvm_i1_ty, // matrix_a_reuse (argument 4, immediate)
3645
+ llvm_i1_ty, // matrix_b_reuse (argument 5, immediate)
3646
+ ],
3647
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, // ImmArg: the C modifier and both reuse flags
3648
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3649
+ >;
3650
+
3651
+ class AMDGPUWmmaIntrinsicF4ModsC<LLVMType A, LLVMType B, LLVMType CD> :
3652
+ Intrinsic< // WMMA form with independently typed A and B, a modifier on C only, and no reuse flags.
3653
+ [CD], // %D: result, type CD
3654
+ [
3655
+ A, // %A: type A
3656
+ B, // %B: type B (its own type parameter, not tied to %A)
3657
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) (argument 2, immediate)
3658
+ LLVMMatchType<0>, // %C: tied to overloaded type 0, i.e. the same type as %D
3659
+ ],
3660
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree] // ImmArg: the C modifier only
3661
+ >;
3662
+
3663
+ class AMDGPUWmmaIntrinsicModsAll<LLVMType AB, LLVMType CD> :
3664
+ Intrinsic< // WMMA form with modifiers on A, B and C; this variant has no reuse flags.
3665
+ [CD], // %D: result, type CD
3666
+ [
3667
+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (argument 0, immediate)
3668
+ AB, // %A: type AB
3669
+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (argument 2, immediate)
3670
+ LLVMMatchType<1>, // %B: tied to overloaded type 1 (AB when both CD and AB are overloaded "any" types)
3671
+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (argument 4, immediate)
3672
+ LLVMMatchType<0>, // %C: tied to overloaded type 0, i.e. the same type as %D
3673
+ ],
3674
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree] // ImmArg: the three modifiers
3675
+ >;
3676
+
3677
+ class AMDGPUWmmaIntrinsicModsAllReuse<LLVMType AB, LLVMType CD> :
3678
+ Intrinsic< // WMMA form with modifiers on A, B and C, followed by matrix_a_reuse/matrix_b_reuse flags.
3679
+ [CD], // %D: result, type CD
3680
+ [
3681
+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (argument 0, immediate)
3682
+ AB, // %A: type AB
3683
+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (argument 2, immediate)
3684
+ LLVMMatchType<1>, // %B: tied to overloaded type 1 (AB when both CD and AB are overloaded "any" types)
3685
+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (argument 4, immediate)
3686
+ LLVMMatchType<0>, // %C: tied to overloaded type 0, i.e. the same type as %D
3687
+ llvm_i1_ty, // matrix_a_reuse (argument 6, immediate)
3688
+ llvm_i1_ty, // matrix_b_reuse (argument 7, immediate)
3689
+ ],
3690
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, // ImmArg: the three modifiers and both reuse flags
3691
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3692
+ >;
3693
+
3694
+ // D and C are of different types: %D uses DstTy and %C uses its own type parameter C instead of LLVMMatchType<0>.
3695
+ class AMDGPUWmmaIntrinsicModsAllDiff<LLVMType DstTy, LLVMType AB, LLVMType C> :
3696
+ Intrinsic< // WMMA form with modifiers on A, B and C, followed by matrix_a_reuse/matrix_b_reuse flags.
3697
+ [DstTy], // %D: result, type DstTy
3698
+ [
3699
+ llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg (argument 0, immediate)
3700
+ AB, // %A: type AB
3701
+ llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg (argument 2, immediate)
3702
+ LLVMMatchType<1>, // %B: tied to overloaded type 1 (AB when both DstTy and AB are overloaded "any" types)
3703
+ llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) (argument 4, immediate)
3704
+ C, // %C: type C, independent of %D
3705
+ llvm_i1_ty, // matrix_a_reuse (argument 6, immediate)
3706
+ llvm_i1_ty, // matrix_b_reuse (argument 7, immediate)
3707
+ ],
3708
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, // ImmArg: the three modifiers and both reuse flags
3709
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
3710
+ >;
3711
+
3712
+ defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = { // every def inside the braces is also collected into this list
3713
+ def int_amdgcn_wmma_f64_16x16x4_f64 : AMDGPUWmmaIntrinsicModsAll<llvm_anyfloat_ty, llvm_anyfloat_ty>; // only entry using the A/B/C-modifier class without reuse flags
3714
+ def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>; // float A/B and C/D: A/B/C modifiers plus reuse flags
3715
+ def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3716
+ def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3717
+ def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3718
+ def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
3719
+ def int_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllDiff<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty>; // only entry whose D and C types are overloaded independently
3720
+ def int_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>; // fp8/bf8 entries: integer-typed A/B, float-typed C/D, C modifier only
3721
+ def int_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3722
+ def int_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3723
+ def int_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3724
+ def int_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3725
+ def int_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3726
+ def int_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3727
+ def int_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3728
+ def int_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3729
+ def int_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3730
+ def int_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3731
+ def int_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3732
+ def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3733
+ def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3734
+ def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3735
+ def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC<llvm_anyint_ty, llvm_anyfloat_ty>;
3736
+ def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB<llvm_anyint_ty, llvm_anyint_ty>; // integer A/B and C/D; A/B modifiers only
3737
+ def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>; // A and B overloaded independently; C modifier only, no reuse flags
3738
+ }
3739
+
3740
+ class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
3741
+ Intrinsic< // SWMMAC form with 1-bit modifiers on A and B, a sparsity index for A, and matrix_a_reuse/matrix_b_reuse flags.
3742
+ [CD], // %D: result, type CD
3743
+ [
3744
+ llvm_i1_ty, // %A_mod: 0 - none, 1 - neg (argument 0, immediate)
3745
+ A, // %A: type A
3746
+ llvm_i1_ty, // %B_mod: 0 - none, 1 - neg (argument 2, immediate)
3747
+ B, // %B: type B (its own type parameter, not tied to %A)
3748
+ LLVMMatchType<0>, // %C: tied to overloaded type 0 (CD when CD is an overloaded "any" type, as in the definitions using this class)
3749
+ Index, // %Sparsity index for A
3750
+ llvm_i1_ty, // matrix_a_reuse (argument 6, immediate)
3751
+ llvm_i1_ty, // matrix_b_reuse (argument 7, immediate)
3752
+ ],
3753
+ [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>] // ImmArg: both modifiers and both reuse flags; the index is not an immediate
3754
+ >; // NOTE(review): the AMDGPUWmma* classes in this file also list IntrNoCallback and IntrNoFree; confirm the omission here is intended.
3755
+
3756
+ defset list<Intrinsic> AMDGPUSWMMACIntrinsicsGFX1250 = { // every def inside the braces is also collected into this list
3757
+ def int_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; // float A, B and C/D with A/B modifiers; integer sparsity index
3758
+ def int_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3759
+ def int_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3760
+ def int_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3761
+ def int_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; // NOTE(review): %C is tied to %D here, while int_amdgcn_wmma_bf16f32_16x16x32_bf16 uses distinct D and C types; confirm intended.
3762
+ def int_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; // fp8/bf8 entries: integer-typed A and B, float-typed C/D, no A/B modifiers
3763
+ def int_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3764
+ def int_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3765
+ def int_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3766
+ def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3767
+ def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3768
+ def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3769
+ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
3770
+ def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; // all-integer entry; uses the A/B-modifier class
3771
+ }
3772
+
3773
+
3605
3774
class AMDGPUTensorLoadStore:
3606
3775
Intrinsic<
3607
3776
[],
0 commit comments